suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
library(patchwork)
Settings
# ---- Paths ----
# `wd` keeps its trailing slash because every path below is built with paste0().
# NOTE(review): `data_dir` is not referenced by the code visible in this file,
# and the bedGraph inputs further down are read from /Volumes/Mitsu_NGS_2/...
# rather than Mitsu_NGS_3 -- confirm which volume is intended.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'
wd <- '~/Google Drive/My Drive/Analysis/METTL2A/'
setwd(wd)

# Output directory for the figures saved by this script; created if missing.
figdir <- paste0(wd, 'Figures/Shortread/Mapping/')
dir.create(figdir, recursive = TRUE, showWarnings = FALSE)
#tabledir <- paste0(wd, 'Tables/')

# ---- Themes ----
# Session-wide default: small classic theme with the legend underneath.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))

# Theme for the stacked mapping panels: no legend, no axis titles and no
# x-axis text, line or ticks.
theme_mapping <- theme_classic(base_size = 7) +
  theme(
    legend.position = 'none',
    axis.text.x = element_blank(),
    axis.title = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.x = element_blank()
  )
Functions
#' Prefix a relative path with the analysis working directory.
#'
#' Plain string concatenation with the global `wd` (no separator is inserted),
#' so `wd` has to end with a slash.
#'
#' @param path Character vector of paths relative to `wd`.
#' @return Character vector: `wd` followed by `path`.
paste_wd <- function(path) paste0(wd, path)
#' Read a header-less, tab-separated 4-column bedGraph file.
#'
#' Column types are declared explicitly instead of being guessed, so a contig
#' name that looks numeric cannot be misread as a number and readr does not
#' print a column-specification message for every file. The declared types are
#' the ones readr guessed for the existing inputs (one character column and
#' three double columns).
#'
#' @param path Path to the bedGraph file, passed straight to `read_tsv()`.
#' @return A tibble with columns `seq_name` (chr), `start`, `end` and `value`
#'   (all dbl).
read_bedgz <- function(path) {
  read_tsv(
    path,
    col_names = c('seq_name', 'start', 'end', 'value'),
    col_types = cols(
      seq_name = col_character(),
      start = col_double(),
      end = col_double(),
      value = col_double()
    )
  )
}
#' Expand BED12 records into one row per block (exon).
#'
#' For every input row the comma-separated `blockStarts` / `blockSizes` strings
#' are parsed and turned into block coordinates:
#' `start = start + blockStart`, `end = start + blockStart + blockSize`.
#'
#' Empty fields (e.g. from a trailing comma such as "10,20,") are dropped
#' before conversion, so they no longer create an all-NA block row.
#'
#' @param bed12_df Data frame with at least the columns `chrom`, `start`,
#'   `name`, `score`, `strand`, `blockSizes` and `blockStarts`.
#' @return An ungrouped tibble with one row per block and the columns
#'   `chrom`, `start`, `end`, `name`, `score`, `strand`.
bed12_to_exon <- function(bed12_df) {
  # "a,b,c" -> integer vector; nzchar() keeps NA so missing annotations still
  # yield a single NA row, as before.
  parse_blocks <- function(x) {
    map(str_split(x, ','), \(fields) as.integer(fields[nzchar(fields)]))
  }

  bed12_df |>
    ungroup() |>
    mutate(
      blockSizes = parse_blocks(blockSizes),
      blockStarts = parse_blocks(blockStarts),
      # Per-record vector of block starts (record start + block offsets).
      start = map2(start, blockStarts, \(tx_start, offsets) tx_start + offsets),
      # mutate() evaluates sequentially: `start` now holds the block starts,
      # so adding the block sizes gives the block ends.
      end = map2(start, blockSizes, \(block_start, sizes) block_start + sizes)
    ) |>
    select(chrom, start, end, name, score, strand) |>
    unnest(cols = c(start, end))
}
#' Exon coordinates of every transcript of one gene.
#'
#' Looks the gene up in the global `transcript2gene`, attaches the BED12 record
#' of each transcript from the global `transcripts_annotation`, labels each
#' transcript as "<transcript_id>|<transcript_name>" and expands the records
#' with `bed12_to_exon()`.
#'
#' The join key is spelled out so the join cannot silently change if the two
#' tables ever gain another column with the same name.
#'
#' @param genename Gene name matched against `transcript2gene$gene_name`.
#' @return The tibble returned by `bed12_to_exon()`: one row per exon with
#'   `chrom`, `start`, `end`, `name`, `score`, `strand`.
calc_exonpos_onegene <- function(genename) {
  transcript2gene |>
    filter(gene_name == genename) |>
    left_join(transcripts_annotation, by = 'transcript_id') |>
    mutate(name = paste(transcript_id, transcript_name, sep = '|')) |>
    bed12_to_exon()
}
#' Read one bedGraph file and keep only the signal over one gene.
#'
#' The gene region runs from one base before the smallest transcript start to
#' the largest transcript end, taken from the global `transcript2gene` joined
#' to the global `transcripts_annotation`. Intervals overlapping the region are
#' kept and clamped to it. The sample labels `type`, `si` and `rep` are parsed
#' from the file name.
#'
#' Changes relative to the previous version:
#' * bedGraph intervals are half-open, so intervals that merely touch the
#'   region boundary are no longer kept (they became zero-width rows after
#'   clamping);
#' * contigs are matched with `%in%`, so a gene annotated on more than one
#'   contig no longer triggers vector recycling in the comparison;
#' * an unknown gene fails immediately with a clear message instead of after
#'   the whole file has been read;
#' * the join key is explicit.
#'
#' @param bedgz Path to a bedGraph file readable by `read_bedgz()`.
#' @param genename Gene name matched against `transcript2gene$gene_name`.
#' @return A tibble with columns `seq_name`, `start`, `end`, `value`, `type`,
#'   `si`, `rep`.
read_bedgz_onegene <- function(bedgz, genename) {
  gene_transcripts <-
    transcript2gene |>
    filter(gene_name == genename) |>
    left_join(transcripts_annotation, by = 'transcript_id')

  if (nrow(gene_transcripts) == 0) {
    stop("No transcripts found for gene '", genename, "'", call. = FALSE)
  }

  minstart <- min(gene_transcripts$start, na.rm = TRUE) - 1
  maxend <- max(gene_transcripts$end, na.rm = TRUE)
  gene_seqnames <- unique(gene_transcripts$seqname)

  # Strip the run-specific prefix/suffix and the '_No' token from the file
  # name; what remains is split on '_' into type / si / rep below.
  sample_label <-
    basename(bedgz) |>
    str_remove_all(
      paste(
        c('221223_NovaSeq_SP_TruseqUD_l1_[0-9]+_[A-Z][0-9]+_Dr_Taniue_[0-9]_',
          '_S[0-9]{2}_L001_Aligned.sortedByCoord.out_CPM_size1.bdg.gz',
          '_No'),
        collapse = '|'
      )
    )

  read_bedgz(bedgz) |>
    filter(seq_name %in% gene_seqnames) |>
    # Half-open overlap test: [start, end) against [minstart, maxend).
    filter(end > minstart & start < maxend) |>
    # NOTE(review): this drops intervals that span the whole region
    # (start < minstart and end > maxend) even though the clamping below could
    # handle them. Kept unchanged -- confirm whether this is intended.
    filter(start >= minstart | end <= maxend) |>
    mutate(
      start = pmax(start, minstart),
      end = pmin(end, maxend),
      basename = sample_label
    ) |>
    separate(basename, into = c('type', 'si', 'rep'), sep = '_')
}
#' Map transcript-relative m3C site positions of one gene to genomic positions.
#'
#' Transcripts of the gene (global `transcript2gene`) are joined to their sites
#' (global `DRS_m3Csites`, column `kmer_middle`) and to their BED12 records
#' (global `transcripts_annotation`). Exons are walked in transcript order,
#' the exon containing each site is located from the cumulative exon length,
#' and the offset inside that exon is converted to a genomic coordinate.
#'
#' Changes relative to the previous version:
#' * plus-strand transcripts are handled: the old formula
#'   `start + blockStart + (cum_size - kmer_middle)` measures the offset from
#'   the genomic start of the exon, which is only right when transcript order
#'   runs against genomic order (minus strand). On the plus strand it mirrored
#'   the site inside the exon;
#' * block fields are converted to numbers explicitly and empty fields
#'   (trailing comma) are dropped;
#' * join keys are explicit, and the grouping columns that `select()` used to
#'   add back implicitly are now listed.
#'
#' NOTE(review): the arithmetic treats `kmer_middle` as a 0-based transcript
#' position (a site on the last base of an exon gives `pos_in_exon == 1`) --
#' confirm against the file that provides `kmer_middle`.
#'
#' @param genename Gene name matched against `transcript2gene$gene_name`.
#' @return A tibble, still grouped by `transcript_id`, `transcript_type`,
#'   `transcript_name` and `kmer_middle`, with columns `transcript_type`,
#'   `transcript_name`, `transcript_id`, `kmer_middle`, `genomic_pos`,
#'   `start` (= `genomic_pos - 1`) and `end` (= `genomic_pos`).
calc_m3C_sites_genomicpos <- function(genename) {
  transcript2gene |>
    filter(gene_name == genename) |>
    inner_join(DRS_m3Csites, by = 'transcript_id') |>
    inner_join(transcripts_annotation, by = 'transcript_id') |>
    select(
      transcript_id, transcript_type, transcript_name,
      kmer_middle, strand, start, end, blockSizes, blockStarts
    ) |>
    # One row per exon; convert = TRUE turns the split fields into numbers.
    separate_rows(c(blockSizes, blockStarts), convert = TRUE) |>
    # A trailing comma leaves an empty field, which converts to NA.
    filter(!is.na(blockSizes), !is.na(blockStarts)) |>
    group_by(transcript_id, transcript_type, transcript_name, kmer_middle) |>
    mutate(
      # Minus-strand transcripts are numbered from the last genomic exon.
      exon_num = ifelse(strand == '+', row_number(), n() - row_number() + 1)
    ) |>
    arrange(transcript_id, kmer_middle, exon_num) |>
    mutate(
      # Transcript length up to and including each exon, in transcript order.
      cum_size = cumsum(blockSizes),
      # Bases from the site to the transcript-order end of the exon.
      pos_in_exon = cum_size - kmer_middle
    ) |>
    # The first exon whose cumulative length exceeds the site contains it.
    filter(pos_in_exon > 0) |>
    filter(pos_in_exon == min(pos_in_exon)) |>
    mutate(
      genomic_pos = ifelse(
        strand == '+',
        # Transcript-order end of the exon is its genomic end: count back.
        start + blockStarts + blockSizes - pos_in_exon + 1,
        # Transcript-order end of the exon is its genomic start: count forward.
        start + blockStarts + pos_in_exon
      )
    ) |>
    select(
      transcript_type, transcript_name,
      transcript_id, kmer_middle, genomic_pos
    ) |>
    mutate(start = genomic_pos - 1, end = genomic_pos)
}
Read data
# BED12 transcript models; the BED `name` column is renamed to `transcript_id`
# so it can serve as the join key with the other tables.
transcripts_annotation <- rename(
  read_bed12(paste_wd('Tables/Database/Espresso_AsPC1_annotation.bed')),
  transcript_id = name
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
transcripts_annotation
## # A tibble: 36,687 × 12
## chrom start end transcript_id score strand thickStart thickEnd itemRgb
## <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 chrX 2913613 2929275 ENST000002178… 0 - NA NA 255,0,0
## 2 chrX 2914496 2917878 ENST000004813… 0 - NA NA 255,0,0
## 3 chrX 2920380 2929279 ENST000004948… 0 - NA NA 255,0,0
## 4 chrX 2920675 2929339 ENST000005593… 0 - NA NA 255,0,0
## 5 chrX 2951737 2964288 ENST000006827… 0 - NA NA 255,0,0
## 6 chrX 3604339 3713649 ENST000002628… 0 - NA NA 255,0,0
## 7 chrX 3610950 3612255 ENST000004966… 0 - NA NA 255,0,0
## 8 chrX 3817527 3843566 ENST000006620… 0 - NA NA 255,0,0
## 9 chrX 3817527 3843508 ENST000004909… 0 - NA NA 255,0,0
## 10 chrX 3817527 3822622 ENST000004610… 0 - NA NA 255,0,0
## # ℹ 36,677 more rows
## # ℹ 3 more variables: blockCount <dbl>, blockSizes <chr>, blockStarts <chr>
# Transcript-to-gene lookup: only the identifier/annotation columns of the
# DESeq2 result table are kept (transcript_id through gene_name, plus
# genetype2, common_DETs and seqname).
transcript2gene <- select(
  read_tsv(
    paste_wd('Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv')
  ),
  transcript_id:gene_name, genetype2, common_DETs, seqname
)
## Rows: 36717 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): transcript_id, transcript_type, transcript_name, gene_id, gene_typ...
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
transcript2gene
## # A tibble: 36,717 × 9
## transcript_id transcript_type transcript_name gene_id gene_type gene_name
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000498442.1 retained_intron CRBN-212 ENSG00… protein_… CRBN
## 2 ENST00000459840.5 retained_intron CRBN-205 ENSG00… protein_… CRBN
## 3 ENST00000231948.9 protein_coding CRBN-201 ENSG00… protein_… CRBN
## 4 ENST00000432408.6 protein_coding CRBN-203 ENSG00… protein_… CRBN
## 5 ENST00000339437.… protein_coding TRNT1-203 ENSG00… protein_… TRNT1
## 6 ENST00000488263.5 retained_intron CRBN-209 ENSG00… protein_… CRBN
## 7 ENST00000420393.5 protein_coding TRNT1-207 ENSG00… protein_… TRNT1
## 8 ENST00000698415.1 retained_intron TRNT1-230 ENSG00… protein_… TRNT1
## 9 ENST00000450014.1 protein_coding CRBN-204 ENSG00… protein_… CRBN
## 10 ENST00000698416.1 retained_intron TRNT1-231 ENSG00… protein_… TRNT1
## # ℹ 36,707 more rows
## # ℹ 3 more variables: genetype2 <chr>, common_DETs <chr>, seqname <chr>
# m3C sites from the DRS table, reduced to the transcript ID and the
# transcript-relative position (`kmer_middle`).
DRS_m3Csites <- select(
  read_tsv(
    paste_wd(
      'Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv'
    )
  ),
  transcript_id, kmer_middle
)
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
DRS_m3Csites
## # A tibble: 489 × 2
## transcript_id kmer_middle
## <chr> <dbl>
## 1 ENST00000429711.7 425
## 2 ENST00000647248.2 383
## 3 ENST00000647248.2 384
## 4 ENST00000389680.2 60
## 5 ENST00000389680.2 78
## 6 ENST00000389680.2 96
## 7 ENST00000389680.2 151
## 8 ENST00000389680.2 156
## 9 ENST00000389680.2 157
## 10 ENST00000389680.2 158
## # ℹ 479 more rows
S100A4
# Coverage over S100A4 from every *.gz bedGraph in the alignment directory,
# stacked into one tibble (one set of rows per file).
S100A4 <-
  fs::dir_ls(
    '/Volumes/Mitsu_NGS_2/METTL2A/Alignment/STAR/Espresso_AsPC1/',
    glob = '*.gz'
  ) |>
  map(\(bdg_path) read_bedgz_onegene(bdg_path, genename = 'S100A4')) |>
  reduce(bind_rows)
## Joining with `by = join_by(transcript_id)`
## Rows: 27380368 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 27063534 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 29890931 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 27849901 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 27910942 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 26915565 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 24890490 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 21900206 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 20315927 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
S100A4
## # A tibble: 19,116 × 7
## seq_name start end value type si rep
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 chr1 153543611 153543612 0.0649 Cont D N1
## 2 chr1 153543612 153543613 0.110 Cont D N1
## 3 chr1 153543613 153543614 0.123 Cont D N1
## 4 chr1 153543614 153543616 0.149 Cont D N1
## 5 chr1 153543616 153543617 0.227 Cont D N1
## 6 chr1 153543617 153543618 0.473 Cont D N1
## 7 chr1 153543618 153543619 1.19 Cont D N1
## 8 chr1 153543619 153543620 1.32 Cont D N1
## 9 chr1 153543620 153543621 3.51 Cont D N1
## 10 chr1 153543621 153543622 4.92 Cont D N1
## # ℹ 19,106 more rows
# Exon table for S100A4; `tr_num` is the numeric code of the transcript label
# (its position among the sorted unique labels).
S100A4_exons <- mutate(
  calc_exonpos_onegene('S100A4'),
  tr_num = as.numeric(factor(name))
)
## Joining with `by = join_by(transcript_id)`
S100A4_exons
## # A tibble: 39 × 7
## chrom start end name score strand tr_num
## <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 chr1 153543612 153543923 ENST00000481009.1|S100A4-206 0 - 6
## 2 chr1 153544653 153545097 ENST00000481009.1|S100A4-206 0 - 6
## 3 chr1 153543620 153543923 ENST00000368714.1|S100A4-202 0 - 2
## 4 chr1 153544653 153544809 ENST00000368714.1|S100A4-202 0 - 2
## 5 chr1 153550064 153550136 ENST00000368714.1|S100A4-202 0 - 2
## 6 chr1 153543612 153543923 ENST00000368715.5|S100A4-203 0 - 3
## 7 chr1 153544653 153544809 ENST00000368715.5|S100A4-203 0 - 3
## 8 chr1 153544957 153545063 ENST00000368715.5|S100A4-203 0 - 3
## 9 chr1 153543620 153543923 ENST00000368716.9|S100A4-204 0 - 4
## 10 chr1 153544653 153544809 ENST00000368716.9|S100A4-204 0 - 4
## # ℹ 29 more rows
# Shared x range for the coverage and transcript panels.
xlim <- c(min(S100A4$start), max(S100A4$end))

# Coverage track: one facet per `si` value, replicate N1 only. Each bedGraph
# interval is drawn as a rectangle from 0 up to its value.
p1 <-
  ggplot(filter(S100A4, rep == 'N1'), aes()) +
  geom_rect(
    aes(xmin = start, xmax = end, ymin = 0, ymax = value, fill = si)
  ) +
  facet_wrap(vars(si), ncol = 1) +
  scale_x_continuous(limits = xlim) +
  scale_fill_manual(values = c('#8C8C8C', '#37D9CC', '#A3A3F9')) +
  theme_mapping
p1

# Full span of each transcript (first exon start to last exon end); drawn as
# the thin line connecting the exons in the transcript panel.
S100A4_transcripts <-
  S100A4_exons |>
  group_by(name, strand) |>
  summarise(start = min(start), end = max(end), .groups = 'drop')
S100A4_transcripts
## # A tibble: 12 × 4
## name strand start end
## <chr> <chr> <dbl> <dbl>
## 1 ENST00000354332.8|S100A4-201 - 153543612 153545802
## 2 ENST00000368714.1|S100A4-202 - 153543620 153550136
## 3 ENST00000368715.5|S100A4-203 - 153543612 153545063
## 4 ENST00000368716.9|S100A4-204 - 153543620 153545806
## 5 ENST00000468373.1|S100A4-205 - 153543612 153545806
## 6 ENST00000481009.1|S100A4-206 - 153543612 153545097
## 7 ESPRESSO:chr1:1668:12|NA - 153543621 153545057
## 8 ESPRESSO:chr1:1668:14|NA - 153543621 153545792
## 9 ESPRESSO:chr1:1668:4|NA - 153543621 153545791
## 10 ESPRESSO:chr1:1668:5|NA - 153543621 153545084
## 11 ESPRESSO:chr1:1668:7|NA - 153543621 153545791
## 12 ESPRESSO:chr1:1668:8|NA - 153543621 153545791
# Transcript panel: thick tiles for exons, thin tiles for the transcript span.
# Tiles are centred on the interval midpoint with the interval length as width.
p3 <-
  ggplot(
    S100A4_exons,
    aes(x = (start + end) / 2, y = name, width = (end - start))
  ) +
  geom_tile(height = .8) +
  geom_tile(data = S100A4_transcripts, height = .1) +
  scale_x_continuous(limits = xlim) +
  theme_mapping

# Coverage on top of the transcript models (6:2 height ratio), saved as PDF.
plot <- p1 / p3 + plot_layout(heights = c(6, 2))
ggsave(
  filename = paste0(figdir, 'S100A4_shortread_mapping.pdf'),
  plot = plot,
  width = 18,
  height = 8,
  units = 'cm'
)
RPS24
# Coverage over RPS24 from every *.gz bedGraph in the alignment directory,
# stacked into one tibble (one set of rows per file).
RPS24 <-
  fs::dir_ls(
    '/Volumes/Mitsu_NGS_2/METTL2A/Alignment/STAR/Espresso_AsPC1/',
    glob = '*.gz'
  ) |>
  map(\(bdg_path) read_bedgz_onegene(bdg_path, genename = 'RPS24')) |>
  reduce(bind_rows)
## Joining with `by = join_by(transcript_id)`
## Rows: 27380368 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 27063534 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 29890931 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 27849901 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 27910942 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 26915565 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 24890490 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 21900206 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
## Rows: 20315927 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): seq_name dbl (3): start, end, value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
RPS24
## # A tibble: 21,214 × 7
## seq_name start end value type si rep
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 chr10 78033758 78033758 0.00649 Cont D N1
## 2 chr10 78033758 78033760 0.0195 Cont D N1
## 3 chr10 78033760 78033767 0.0454 Cont D N1
## 4 chr10 78033767 78033771 0.0519 Cont D N1
## 5 chr10 78033771 78033774 0.0584 Cont D N1
## 6 chr10 78033774 78033776 0.0713 Cont D N1
## 7 chr10 78033776 78033777 0.110 Cont D N1
## 8 chr10 78033777 78033791 0.117 Cont D N1
## 9 chr10 78033791 78033799 0.123 Cont D N1
## 10 chr10 78033799 78033802 0.136 Cont D N1
## # ℹ 21,204 more rows
# Exon table for RPS24; `tr_num` is the numeric code of the transcript label
# (its position among the sorted unique labels).
RPS24_exons <- mutate(
  calc_exonpos_onegene('RPS24'),
  tr_num = as.numeric(factor(name))
)
## Joining with `by = join_by(transcript_id)`
RPS24_exons
## # A tibble: 30 × 7
## chrom start end name score strand tr_num
## <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 chr10 78033885 78033904 ENST00000466129.6|RPS24-207 0 + 3
## 2 chr10 78035351 78035417 ENST00000466129.6|RPS24-207 0 + 3
## 3 chr10 78035510 78036529 ENST00000466129.6|RPS24-207 0 + 3
## 4 chr10 78033916 78033986 ENST00000482069.5|RPS24-212 0 + 6
## 5 chr10 78035351 78035417 ENST00000482069.5|RPS24-212 0 + 6
## 6 chr10 78035510 78035720 ENST00000482069.5|RPS24-212 0 + 6
## 7 chr10 78037193 78037304 ENST00000482069.5|RPS24-212 0 + 6
## 8 chr10 78040614 78040696 ENST00000482069.5|RPS24-212 0 + 6
## 9 chr10 78033862 78033904 ENST00000372360.9|RPS24-202 0 + 1
## 10 chr10 78035351 78035417 ENST00000372360.9|RPS24-202 0 + 1
## # ℹ 20 more rows
# Shared x range for the coverage and transcript panels.
xlim <- c(min(RPS24$start), max(RPS24$end))

# Coverage track: one facet per `si` value, replicate N1 only. Each bedGraph
# interval is drawn as a rectangle from 0 up to its value.
p1 <-
  ggplot(filter(RPS24, rep == 'N1'), aes()) +
  geom_rect(
    aes(xmin = start, xmax = end, ymin = 0, ymax = value, fill = si)
  ) +
  facet_wrap(vars(si), ncol = 1) +
  scale_x_continuous(limits = xlim) +
  scale_fill_manual(values = c('#8C8C8C', '#37D9CC', '#A3A3F9')) +
  theme_mapping
p1

# Full span of each transcript (first exon start to last exon end); drawn as
# the thin line connecting the exons in the transcript panel.
RPS24_transcripts <-
  RPS24_exons |>
  group_by(name, strand) |>
  summarise(start = min(start), end = max(end), .groups = 'drop')
RPS24_transcripts
## # A tibble: 7 × 4
## name strand start end
## <chr> <chr> <dbl> <dbl>
## 1 ENST00000372360.9|RPS24-202 + 78033862 78040697
## 2 ENST00000435275.5|RPS24-203 + 78033759 78040713
## 3 ENST00000466129.6|RPS24-207 + 78033885 78036529
## 4 ENST00000478655.6|RPS24-210 + 78033862 78037883
## 5 ENST00000480662.2|RPS24-211 + 78038555 78040701
## 6 ENST00000482069.5|RPS24-212 + 78033916 78040696
## 7 ENST00000613865.5|RPS24-214 + 78033759 78040716
# Transcript panel: thick tiles for exons, thin tiles for the transcript span.
# Tiles are centred on the interval midpoint with the interval length as width.
p3 <-
  ggplot(
    RPS24_exons,
    aes(x = (start + end) / 2, y = name, width = (end - start))
  ) +
  geom_tile(height = .8) +
  geom_tile(data = RPS24_transcripts, height = .1) +
  scale_x_continuous(limits = xlim) +
  theme_mapping

# Coverage on top of the transcript models (6:2 height ratio), saved as PDF.
plot <- p1 / p3 + plot_layout(heights = c(6, 2))
ggsave(
  filename = paste0(figdir, 'RPS24_shortread_mapping.pdf'),
  plot = plot,
  width = 18,
  height = 8,
  units = 'cm'
)